#include <iostream>
#include <unordered_set>
#include <string>
#include <fstream>
#include <list>

// Dictionary container: the set of known words, used for exact-match lookups.
using word_set = std::unordered_set<std::string>;
// Length, in bytes, of one Chinese character.
// NOTE(review): 2 presumably assumes a double-byte encoding such as GBK;
// CJK characters in UTF-8 usually take 3 bytes -- confirm against the
// dictionary file and the console encoding.
constexpr std::size_t char_len{ 2 };
// Length, in bytes, of the longest dictionary word considered (12 characters).
constexpr std::size_t MAX_KEYWARD_SIZE{ char_len * 12 };


// Loads the dictionary words from a text file into `set`.
//
// Only the first whitespace-delimited token of each non-blank line is kept;
// the remainder of the line is read and discarded.
//
// set  - destination set; words are inserted, existing entries are kept.
// file - path of the dictionary file.
//
// If the file cannot be opened, or is empty, `set` is left unchanged and the
// function returns normally.
void read_distionary(word_set& set,
    const std::string& file)
{
    // Read-only stream: std::fstream would open for read+write and fail on
    // a read-only dictionary file.
    std::ifstream is{ file };
    std::string word;
    std::string rest_of_line;
    // Testing the extraction itself (rather than eof()) stops on open
    // failure as well as on end of input, and never inserts a stale value
    // after the last line.
    while (is >> word)
    {
        set.insert(word);
        std::getline(is, rest_of_line); // skip the other columns of this line
    }
}

// Forward maximum-matching segmentation.
//
// Scans `str` from left to right. At each position the longest substring
// (at most MAX_KEYWARD_SIZE bytes, shrinking by char_len bytes per attempt)
// that is present in `set` becomes the next token. When no dictionary word
// starts at the current position, the single character there is emitted as
// its own token so that no input text is lost.
//
// set - dictionary of known words.
// str - text to segment.
// NOTE(review): offsets advance in char_len-byte steps, so this presumably
// expects text made only of char_len-byte characters -- confirm with callers.
//
// Returns the tokens in text order; their concatenation equals `str`.
auto forward_segment(const word_set& set, const std::string& str)
{
    std::list<std::string> vec;
    const std::size_t total = str.size();
    std::size_t pos = 0;
    while (pos < total)
    {
        const std::size_t remaining = total - pos;
        // Fallback token: one character (shorter only for a truncated tail).
        std::string token = str.substr(pos, char_len);
        // Try candidates from longest to shortest. A zero-length candidate
        // is never tried, so an empty string in the dictionary cannot stall
        // the scan.
        std::size_t len = remaining < MAX_KEYWARD_SIZE ? remaining : MAX_KEYWARD_SIZE;
        while (len > 0)
        {
            const std::string cur = str.substr(pos, len);
            if (set.find(cur) != set.end())
            {
                token = cur;
                break;
            }
            len = len > char_len ? len - char_len : 0;
        }
        pos += token.size(); // token is never empty here, so pos always advances
        vec.push_back(token);
    }
    return vec;
}

// Backward maximum-matching segmentation.
//
// Scans `str` from right to left. At each step the longest substring ending
// at the current position (at most MAX_KEYWARD_SIZE bytes, shrinking by
// char_len bytes per attempt) that is present in `set` becomes the next
// token. When no dictionary word ends there, the single character before the
// current position is emitted as its own token so that no input text is lost.
//
// set - dictionary of known words.
// str - text to segment.
// NOTE(review): offsets move in char_len-byte steps, so this presumably
// expects text made only of char_len-byte characters -- confirm with callers.
//
// Returns the tokens in text order; their concatenation equals `str`.
auto backward_segment(const word_set& set, const std::string& str)
{
    std::list<std::string> vec;
    // `tail` is one past the last byte that has not been segmented yet.
    // Unsigned arithmetic throughout: no int truncation and no reliance on
    // wrapping a negative offset.
    std::size_t tail = str.size();
    while (tail > 0)
    {
        const std::size_t window = tail < MAX_KEYWARD_SIZE ? tail : MAX_KEYWARD_SIZE;
        // Fallback: the last character (shorter only for a truncated head).
        std::size_t len = tail < char_len ? tail : char_len;
        std::size_t cand = window;
        while (cand > 0)
        {
            if (set.find(str.substr(tail - cand, cand)) != set.end())
            {
                len = cand;
                break;
            }
            cand = cand > char_len ? cand - char_len : 0;
        }
        // Tokens are found last-to-first, so prepend to keep text order.
        vec.push_front(str.substr(tail - len, len));
        tail -= len;
    }
    return vec;
}

// Counts the tokens in `set` that are exactly one character (char_len bytes)
// long.
auto count_single_char(const std::list<std::string>& set)
{
    size_t singles = 0;
    for (auto it = set.cbegin(); it != set.cend(); ++it)
    {
        const bool is_single = (it->size() == char_len);
        singles += is_single ? 1u : 0u;
    }
    return singles;
}

// Bidirectional maximum-matching segmentation.
//
// Runs both forward_segment and backward_segment on `str` and picks one
// result using these rules, in order:
//   1. the segmentation with fewer tokens wins;
//   2. on a tie, the one with fewer single-character tokens wins;
//   3. if still tied, the forward result is returned.
//
// set - dictionary of known words.
// str - text to segment.
//
// Returns the chosen token list in text order.
auto  bidirectional_segment(const word_set& set, const std::string& str)
{
    auto fs = forward_segment(set, str);
    auto bs = backward_segment(set, str);
    // Fewer tokens take priority. (The comparison used to be inverted and
    // returned the segmentation with MORE tokens.)
    if (fs.size() < bs.size())
    {
        return fs;
    }
    if (bs.size() < fs.size())
    {
        return bs;
    }
    // Same token count: fewer single-character tokens take priority.
    if (count_single_char(fs) > count_single_char(bs))
    {
        return bs;
    }
    return fs;
}

int main()
{
    word_set set;
    std::string str;

    std::cout.sync_with_stdio(false);

    std::cout << "输入分词字符串：";
    std::cin >> str;

    read_distionary(set, "CoreNatureDictionary.mini.txt");

    auto re = bidirectional_segment(set, str);
    std::cout << "分词结果" << std::endl;
    for (auto& e : re)
    {
        std::cout << e << " ";
    }
    std::cout << std::endl;
    return 0;
}
